### Jonathan D. Klingler, Gary E. Hollibaugh, Jr., and Adam J. Ramey
### "Don't Know What You Got: A Bayesian Hierarchical Model of Neuroticism and Ideological Uncertainty"
### Political Science Research and Methods
###
###
### Preprocessing and Summary Statistics (Tables B-1 through B-3)
### Description: This file takes the raw .sav file and converts it to a .Rda file for later processing
###
### Note 1: Run this file first before the others
### Note 2: Make sure to change the directory to the one containing this file


rm(list=ls())
library(memisc)
library(xtable)
library(plyr)
library(rio)
library(foreign)

# loading the original dataset
# Warnings raised by read.spss() are silenced. Value labels are applied and
# user-defined missing codes are kept as ordinary values (use.missings = FALSE).
cces_big <- suppressWarnings(
  read.spss(
    "CCES14_NYU_OUTPUT_Feb2015.sav",
    use.value.labels = TRUE,
    to.data.frame = TRUE,
    use.missings = FALSE
  )
)

# pulling out the relevant variables for the tables in appendix B
summary_employ <- cces_big$employ
summary_gender <- cces_big$gender
summary_race <- cces_big$race
summary_income <- cces_big$faminc
summary_educ <- cces_big$educ

# age is measured relative to 2014
summary_birthyr <- cces_big$birthyr
summary_age <- 2014 - summary_birthyr

# news interest: "Skipped" and "Not Asked" are treated as missing
summary_news <- cces_big$newsint
is.na(summary_news) <- summary_news %in% c("Skipped", "Not Asked")

# Numeric code of one TIPI item; codes of 8 and above become NA.
# NOTE(review): presumably codes 8+ are the non-substantive responses
# (e.g. skipped / not asked) -- confirm against the codebook.
tipi_item <- function(item) {
  code <- as.numeric(item)
  ifelse(code < 8, code, NA)
}

summary_tipi_extraverted <- tipi_item(cces_big$NYA333)
summary_tipi_critical <- tipi_item(cces_big$NYA334)
summary_tipi_dependable <- tipi_item(cces_big$NYA335)
summary_tipi_anxious <- tipi_item(cces_big$NYA336)
summary_tipi_open <- tipi_item(cces_big$NYA337)
summary_tipi_reserved <- tipi_item(cces_big$NYA338)
summary_tipi_sympathetic <- tipi_item(cces_big$NYA339)
summary_tipi_disorganized <- tipi_item(cces_big$NYA340)
summary_tipi_calm <- tipi_item(cces_big$NYA341)
summary_tipi_conventional <- tipi_item(cces_big$NYA342)

# Two-item composite: one item plus the reverse (8 - item) of the other,
# mapped from the 2..14 range of that sum onto 0..1 by the uniform CDF.
tipi_trait <- function(keyed, reversed) {
  punif(keyed + (8 - reversed), min = 2, max = 14)
}

summary_extra <- tipi_trait(summary_tipi_extraverted, summary_tipi_reserved)
summary_agree <- tipi_trait(summary_tipi_sympathetic, summary_tipi_critical)
summary_consc <- tipi_trait(summary_tipi_dependable, summary_tipi_disorganized)
# neuroticism is one minus the calm/anxious composite
summary_neuro <- 1 - tipi_trait(summary_tipi_calm, summary_tipi_anxious)
summary_openn <- tipi_trait(summary_tipi_open, summary_tipi_conventional)

# recoding income to make it more consistent (for some reason, several cutoffs were used and some overlap)
# Every bracket listed here is folded into the 17th level of the factor.
# NOTE(review): the last label carries a trailing space; it only matches if
# the level in the data is spelled exactly that way -- confirm.
top_brackets <- c("$150,000 - $199,999",
                  "$200,000 - $249,999",
                  "$250,000 - $349,999",
                  "$350,000 - $499,999",
                  "$500,000 or more",
                  "$250,000 or more ")
summary_income[summary_income %in% top_brackets] <- levels(summary_income)[17]

# rebuild the factor so that the now-empty levels are dropped
summary_income <- factor(summary_income)


# table b-1
# One row of response counts per placement item, in the order of the row
# labels given below.
placement_counts <- do.call(rbind, lapply(list(cces_big$CC334A,
                                               cces_big$CC334C,
                                               cces_big$CC334D,
                                               cces_big$CC334E,
                                               cces_big$CC334F,
                                               cces_big$CC334G,
                                               cces_big$CC334K,
                                               cces_big$CC334L,
                                               cces_big$CC334M,
                                               cces_big$CC334W), table))
placements <- data.frame(placement_counts,
                         row.names = c("Self",
                                       "Obama",
                                       "Clinton",
                                       "Cruz",
                                       "Paul",
                                       "Bush",
                                       "Democratic Party",
                                       "Republican Party",
                                       "Tea Party",
                                       "Supreme Court"))

# merge "Not sure" and "Skipped" into a single column ...
placements[["Don't Know/Skipped"]] <- placements[["Not.sure"]] + placements[["Skipped"]]
# ... drop the two originals, then swap the last two columns so that the
# merged column comes before "Not Asked"
placements <- placements[, !(names(placements) %in% c("Not.sure", "Skipped"))]
placements <- placements[, c(1:7, 9, 8)]

# row total over every column except "Not Asked"
placements[["Total Asked"]] <- apply(placements[, names(placements) != "Not.Asked"], 1, sum)

print(xtable(placements, digits = 3),
      type = "html",
      file = "table-b1.html")


# creating a mode function
#
# Returns the most frequent value of a vector. When several values are tied,
# the one that appears first in x wins (which.max() returns the first maximum
# and unique() keeps first-appearance order).
#
# Args:
#   x:     a vector (numeric, character, factor, ...).
#   na.rm: if TRUE, missing values are removed before counting, so NA can
#          never be reported as the mode. The default (FALSE) counts NA like
#          any other value.
# Returns:
#   A length-one vector of the same type as x; NA if there is nothing to count.
Mode <- function(x, na.rm = FALSE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}


# Descriptive statistics for a single numeric variable.
#
# Args:
#   x: numeric vector; missing values are ignored by every statistic.
# Returns:
#   A one-row data.frame with columns Mean, Median, Mode, Minimum, Maximum, SD
#   and Placements (the number of non-missing values), rounded to 3 decimals.
summary_stats <- function(x){
  foo_mean <- mean(x, na.rm = TRUE)
  foo_median <- median(x, na.rm = TRUE)
  # Mode() counts NA like any other value, so missing values are removed
  # first; otherwise a variable that is mostly missing would report NA as its
  # mode while the other statistics describe the observed values only.
  foo_mode <- Mode(x[!is.na(x)])
  foo_min <- min(x, na.rm = TRUE)
  foo_max <- max(x, na.rm = TRUE)
  foo_sd <- sd(x, na.rm = TRUE)
  foo_placements <- sum(!is.na(x))
  foo_out <- data.frame(Mean = foo_mean,
                        Median = foo_median,
                        Mode = foo_mode,
                        Minimum = foo_min,
                        Maximum = foo_max,
                        SD = foo_sd,
                        Placements = foo_placements)
  round(foo_out, digits = 3)
}
	


# table b-2
# one column per personality trait (each on the 0..1 scale built earlier)
tipi_scores <- data.frame(Openness = as.numeric(summary_openn),
                          Conscientiousness = as.numeric(summary_consc),
                          Extraversion = as.numeric(summary_extra),
                          Agreeableness = as.numeric(summary_agree),
                          Neuroticism = as.numeric(summary_neuro))

# summarise each column, then stack the one-row results; ldply() puts the
# column name in the first column, which is relabelled below
tipi.table <- ldply(lapply(tipi_scores, summary_stats), data.frame)
names(tipi.table)[1] <- "Personality Trait"

print(xtable(tipi.table, digits = 3),
      type = "html",
      file = "table-b2.html")

# table b-3
# 0/1 indicator for "values equals label" (NA where the comparison is NA)
indicator <- function(values, label) {
  as.numeric(values == label)
}

# NOTE(review): Income is the level index of summary_income; nothing above
# removes non-dollar levels such as "Prefer not to say", so their indices
# enter the Income statistics -- confirm this is intended.
# NOTE(review): %in% never returns NA, so a missing race value is counted as
# "Other Race" = 1 -- confirm.
other_vars <- data.frame(
  Female = indicator(summary_gender, "Female"),
  Age = as.numeric(summary_age),
  Black = indicator(summary_race, "Black"),
  Hispanic = indicator(summary_race, "Hispanic"),
  "Other Race" = as.numeric(!(summary_race %in% c("White", "Black", "Hispanic"))),
  Education = as.numeric(summary_educ),
  "High News Interest" = indicator(summary_news, "Most of the time"),
  "Unknown News Interest" = indicator(summary_news, "Don't know"),
  Income = as.numeric(summary_income),
  "Income Refused" = indicator(cces_big$faminc, "Prefer not to say"),
  "Employed Full-Time" = indicator(summary_employ, "Full-time"),
  "Employed Part-Time" = indicator(summary_employ, "Part-time"),
  Unemployed = indicator(summary_employ, "Unemployed"),
  Retired = indicator(summary_employ, "Retired")
)

# summarise each column, then stack the one-row results; ldply() puts the
# column name in the first column, which is relabelled below
other.table <- ldply(lapply(other_vars, summary_stats), data.frame)
names(other.table)[1] <- "Variable"

print(xtable(other.table, digits = 3),
      type = "html",
      file = "table-b3.html")
      
      
# reconverting big five back to seven-point scale to be consistent with other code
# rounding is due to machine precision issues
to_seven_point <- function(unit_score) {
  round(as.numeric(1 + unit_score*6), digits = 10)
}

cces_big$self_extra <- to_seven_point(summary_extra)
cces_big$self_agree <- to_seven_point(summary_agree)
cces_big$self_consc <- to_seven_point(summary_consc)
# emotional stability is the reverse of neuroticism
cces_big$self_emoti <- to_seven_point(1-summary_neuro)
cces_big$self_openn <- to_seven_point(summary_openn)

# keep respondents with a non-missing CC421a and all five trait scores
trait_cols <- c("self_extra", "self_emoti", "self_openn", "self_agree", "self_consc")
keep_rows <- !is.na(cces_big$CC421a) & complete.cases(cces_big[, trait_cols])
cces <- cces_big[keep_rows, ]

save(cces, file = "CCES2014.Rda")
                                          